'''
@Author: ysz
@Date: 2019-12-27 09:15:23
@LastEditors  : ysz
@LastEditTime : 2020-01-14 09:46:50
@Description: 爬取58同城租房的个人房源的房源基本信息
'''
import json
import html
import requests
import re
import os.path
import os
import time
import datetime
import random
from bs4 import BeautifulSoup
from lxml import etree

from jiemi_help import Jiemi
from mysqlhelp import MysqlHelper
from user_agent_help import User_Agent_Help
from common_object import TongCheng

# Module-level helper objects shared by the crawler class below.
# Decoder object: its decode58Fangchan(text, key) method is used on every
# detail page before the page is parsed.
jm_help=Jiemi()
# Database helper: provides query_data(...) and sql_exec(sql, data), which the
# crawler uses before and for inserting a listing row.
mysqlhelp=MysqlHelper()
# User-Agent helper; one value is fetched once at import time.
# NOTE(review): `user_agent` is not referenced anywhere else in the visible
# code -- confirm whether it is still needed.
uahelp=User_Agent_Help()
user_agent=uahelp.get_useragent()

class Rent_House(TongCheng):
    """Crawler for the basic information of personal rental listings on 58.com.

    ``page_list`` walks one listing page and hands every matching detail-page
    URL to ``one_detail_page``, which decodes the page, extracts the listing
    fields and inserts them into the ``db_houserent_sh`` table.
    """

    # Only detail URLs containing this prefix are crawled.
    _DETAIL_HOST = 'https://sy.58.com/'
    # Inclusive bounds, in seconds, of the random pause before each detail page.
    _DETAIL_DELAY_SECONDS = (120, 240)
    # Seconds to wait on the detail-page request; without a timeout a stalled
    # connection would block the crawl forever.
    _REQUEST_TIMEOUT = 30
    # <h1> text the site shows on its "page not found" page.
    _NOT_FOUND_MARKER = '你要找的页面不在这个星球上！'

    def page_list(self, url, referer):
        """Crawl every detail page linked from one listing page.

        Args:
            url: Address of the listing page; it is also sent as the referer
                of each detail-page request.
            referer: Referer forwarded to the base-class ``page_list``.
        """
        list_html = super().page_list(url, referer)
        if list_html is None:
            # Defensive: the base implementation is not visible in this file,
            # so tolerate it returning nothing instead of a parsed tree.
            return

        # Detail-page links of the current listing page.
        detail_url_list = list_html.xpath('//div[@class="des"]/h2/a/@href')
        for detail_url in detail_url_list:
            # Example of an accepted link:
            # https://sy.58.com/hezu/40497944696325x.shtml?shangquan=daoyi&...
            if self._DETAIL_HOST not in detail_url:
                continue
            # Long random pause between detail pages.
            time.sleep(random.randint(*self._DETAIL_DELAY_SECONDS))
            self.one_detail_page(detail_url, url)

    def one_detail_page(self, url, referer):
        """Fetch one detail page, extract its fields and store them.

        The page is skipped (nothing is stored, no exception is raised) when
        it is the site's "not found" page, when the response cannot be parsed,
        when the embedded base64 key is absent, or when one of the mandatory
        fields cannot be located. A row is inserted only if
        ``mysqlhelp.query_data`` returns a truthy value.

        Args:
            url: Address of the detail page.
            referer: Value sent in the ``referer`` request header.

        Raises:
            requests.exceptions.RequestException: If the HTTP request fails or
                exceeds ``_REQUEST_TIMEOUT`` seconds.
        """
        print('==='*15)
        print(url, '详情页的地址')
        headers = {
            'cookie': self.headers['cookie'],
            'User-Agent': self.headers['User-Agent'],
            'referer': referer,
        }
        res = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        rest = etree.HTML(res.text)
        if rest is None:
            print(url, '页面内容无法解析，跳过')
            return

        # A page without any <h1> is not treated as "not found"; it simply
        # continues to the checks below.
        h1_texts = rest.xpath('//h1/text()')
        if h1_texts and self._NOT_FOUND_MARKER in h1_texts[0].strip():
            return

        # Turn character references such as &#x958f; into the characters
        # themselves before decoding.
        text = html.unescape(res.text)
        # Extract the base64 payload handed to the decoder. The pattern is
        # kept exactly as it was.
        keys = re.findall(r"base64,(.*)'\).format", text)
        if not keys:
            print(url, '未找到解码用的base64数据，跳过')
            return

        # Decoded page source, re-parsed for field extraction.
        dehtml = jm_help.decode58Fangchan(text, keys[0])
        page = etree.HTML(dehtml)
        if page is None:
            print(url, '解码后的页面无法解析，跳过')
            return

        # Mandatory fields: each XPath is evaluated once and a missing one
        # causes the page to be skipped instead of raising IndexError.
        required = {
            'title': self._first(
                page, '//h1[@class="c_333 f20 strongbox"]/text()'),
            'rent_way': self._first(
                page,
                '//div[@class="house-desc-item fl c_333"]/ul/li[1]/span[2]/text()'),
            'money': self._first(
                page, '//div[@class="house-pay-way f16"]/span[1]/b/text()'),
            'payment_way': self._first(
                page, '//div[@class="house-pay-way f16"]/span[2]/text()'),
            'direction': self._first(
                page,
                '//div[@class="house-desc-item fl c_333"]/ul/li[3]/span[2]/text()'),
            'owner': self._first(
                page, '//p[@class="agent-name f16 pr"]/a/text()'),
            'house_info': self._first(
                page,
                '//div[@class="house-desc-item fl c_333"]/ul/li[2]/span[2]/text()'),
        }
        missing = [name for name, value in required.items() if value is None]
        if missing:
            print(url, '缺少必要字段，跳过:', ','.join(missing))
            return

        # Listing id taken from the file name of the URL, with 'x' removed.
        about_id = url.split('?')[0].split('/')[-1].split('.')[0].replace('x', '')
        # Source address of the listing.
        about_url = url

        title = required['title'].strip()
        rent_way = required['rent_way']
        money = required['money'].strip()
        payment_way = required['payment_way'].strip()
        # Only the part before the first double space is kept.
        direction = required['direction'].split('  ')[0]
        owner = required['owner'].strip()

        # Picture URLs, stored as a JSON array.
        # NOTE: the image files themselves are not downloaded here.
        imglist = page.xpath('//ul[@class="house-pic-list "]//li/img/@lazy_src')
        imgs = json.dumps(imglist)
        print(imgs)

        # Facilities, stored as one comma-separated string.
        configuration = ','.join(
            page.xpath('//ul[@class="house-disposal"]//li/text()'))

        # Layout counts (presumably rooms / halls / bathrooms -- the labels
        # come from the 室/厅/卫 characters in the text).
        model1, model2, model3 = self._split_layout(
            required['house_info'].split(' ')[0])

        # Residential community; '未知' when the page does not name one.
        xq_text = self._first(page, '//div[@class="district-decs fl"]/p/a/text()')
        xq = '未知' if xq_text is None else xq_text.split(':')[-1].strip()

        # District path, joined with '/'.
        ssqy = '/'.join(page.xpath(
            '//ul[@class="district-info-list c_333 f14 lh28"]/li[last()]/span//a/text()'))

        # Detailed address; '无' when absent.
        xxdz_text = self._first(page, '//p[@class="addr c_555 f14"]/text()')
        xxdz = '无' if xxdz_text is None else xxdz_text.split('：')[-1].strip()

        # Highlights, comma-separated.
        fwld = ','.join(page.xpath(
            '//ul[@class="introduce-item"]/li[1]/span[2]//em/text()'))

        # Rental requirements exist only when an item is labelled '出租要求'.
        labels = page.xpath('//ul[@class="introduce-item"]//li/span[1]/text()')
        if '出租要求' in labels:
            czyq = ','.join(page.xpath(
                '//ul[@class="introduce-item"]/li[2]/span[2]//em/text()'))
        else:
            czyq = ''

        # Free-text description; '无' when absent.
        fyms_text = self._first(
            page, '//ul[@class="introduce-item"]//li[last()]/span[2]/text()')
        fyms = '无' if fyms_text is None else fyms_text.strip()

        # Crawl time (Unix seconds) and row status.
        create_time = int(time.time())
        status = 1

        # Query the database once and reuse the answer for both the log line
        # and the decision.
        can_store = mysqlhelp.query_data(
            title=title, owner=owner, xq=xq, money=money,
            configuration=configuration, fyms=fyms, rent_way=rent_way,
            house_type='rent')
        print(can_store, '能不能往数据库存储')
        if not can_store:
            return

        print('准备插入数据库')
        sql = 'insert into db_houserent_sh(about_id,about_url,title,rent_way,money,payment_way,direction,imgs,owner,`configuration`,model1,model2,model3,xq,`ssqy`,`xxdz`,`fwld`,`czyq`,`fyms`,`status`,`create_time`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        data = [about_id, about_url, title, rent_way, money, payment_way,
                direction, imgs, owner, configuration, model1, model2, model3,
                xq, ssqy, xxdz, fwld, czyq, fyms, status, create_time]
        house_id = mysqlhelp.sql_exec(sql, data)
        print(house_id, '房源id')

    @staticmethod
    def _first(page, xpath):
        """Return the first result of ``xpath`` on ``page``, or None if empty."""
        nodes = page.xpath(xpath)
        return nodes[0] if nodes else None

    @staticmethod
    def _split_layout(house_info):
        """Split a layout string such as ``2室1厅1卫`` into its three counts.

        Whatever precedes each of 室, 厅 and 卫 is returned, so multi-character
        counts are kept whole. Text that does not follow that pattern falls
        back to the characters at positions 0, 2 and 4, with an empty string
        for any position beyond the end of the text.
        """
        match = re.match(r'(.+?)室(.+?)厅(.+?)卫', house_info)
        if match:
            return match.groups()
        # Slicing never raises, unlike indexing, when the text is too short.
        return house_info[0:1], house_info[2:3], house_info[4:5]



# Script entry point: crawl the rental listing starting from the first page.
if __name__ == "__main__":
    # path='./download'
    # if not os.path.exists(path):
    #     os.mkdir(path)
    # Address of the first listing page (the path ends in "pn1").
    url='https://sy.58.com/chuzu/0/pn1/'
    # Request headers passed to the crawler. one_detail_page reads the
    # 'cookie' and 'User-Agent' entries through self.headers (presumably the
    # base-class constructor stores this dict there -- confirm in TongCheng).
    # NOTE(review): the cookie is a hard-coded session string and the referer
    # is an anti-bot verification callback URL; both look like values captured
    # from a browser session and will likely need refreshing -- confirm.
    headers={
        'cookie':'f=n; commontopbar_new_city_info=188%7C%E6%B2%88%E9%98%B3%7Csy; f=n; commontopbar_new_city_info=188%7C%E6%B2%88%E9%98%B3%7Csy; time_create=1579137529481; userid360_xml=95ACC2EDA5D9DF3D3463242427FA5BFD; f=n; commontopbar_new_city_info=188%7C%E6%B2%88%E9%98%B3%7Csy; id58=e87rZl3mC5iLtxTfA0qZAg==; wmda_uuid=83b369d94097739f0fd218170fae2922; wmda_new_uuid=1; 58tj_uuid=907d287a-a241-4d7f-b4a9-82f5dc178ef4; als=0; xxzl_deviceid=Li4jM92s0F9MJHnfmW1z1NQ3xUQFJVP94fV2NomSldd3n%2B795%2BBDxO9iDZCQMdlN; Hm_lvt_ae019ebe194212c4486d09f377276a77=1575357349; 58home=sy; myLat=""; myLon=""; mcity=sy; ctid=39; aQQ_ajkguid=1737BD87-E303-12D4-8D33-SX1217091934; __xsptplus8=8.1.1576545583.1576545632.2%234%7C%7C%7C%7C%7C%23%234UPLQ5z-MySS0MJxtXGvRal3qitr3VFH%23; myfeet_tooltip=end; m58comvp=t18v115.159.229.14; Hm_lvt_295da9254bbc2518107d846e1641908e=1576646576; city=sy; param8616=1; param8716kop=1; Hm_lvt_a3013634de7e7a5d307653e15a0584cf=1577260386; wmda_visited_projects=%3B6333604277682%3B11187958619315%3B1731916484865; f=n; commontopbar_new_city_info=188%7C%E6%B2%88%E9%98%B3%7Csy; commontopbar_ipcity=sy%7C%E6%B2%88%E9%98%B3%7C0; xxzl_cid=629caa9d45574456b4659ab0481de6ea; xzuid=33e294c9-8204-4c58-a9f6-1fa708893b6c; defraudName=defraud; ppStore_fingerprint=37C4AA4FC5419C853B885C093FB1B48F92BDE760BC6D8D01%EF%BC%BF1577408663372; wmda_session_id_6333604277682=1577411697662-68c244c7-c7e5-17a8; new_uv=6; utm_source=; spm=; init_refer=https%253A%252F%252Fsy.58.com%252F; wmda_session_id_11187958619315=1577411733439-80453495-f2fe-fde1; new_session=0; JSESSIONID=0B379D04EB0A52F2D41C3A4A3D01CE13; xzfzqtoken=G1eAQvcVAJNGq%2BK31K0IXZBWecSWOAC0fYZbQgHU9yseCySfpiU7nz7Z6lBzbUAMin35brBb%2F%2FeSODvMgkQULA%3D%3D',
        'referer':'https://callback.58.com/antibot/verifycode?serialId=409f05bcbeeeff066c7091eb17b5fde9_f3ebe93edc1d424eaef31f731acf7403&code=22&sign=a62b68976cfa48a25f02db9b8797c8b8&namespace=ershoufanglistphp&url=https%3A%2F%2Fsy.58.com%2Fershoufang%2F0%2Fpn2%2F',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # Build the crawler and start it; run() is not defined in this file and
    # is presumably inherited from TongCheng.
    myrent=Rent_House(url,headers)
    myrent.run()

   
    





